Data was downloaded from kaggle.com.
packs<-c("tidyverse","caret","rmarkdown","rgeos",
"rnaturalearth","rnaturalearthdata","sf",
"ggspatial","maps","plotly","car")
invisible(sapply(packs,require,character=T))
df<-read.csv("covid-19-all.csv")
#### Subset only countries in the Continental United States
us<-df[df$Country.Region == "US" & df$Latitude > 24 & df$Latitude < 54 &
df$Longitude < -62 & df$Longitude > -126,]
####Free memory by deleting the unused dataframe
rm(df)
####Create a basic plot of the counties in the US
plot(us$Longitude,us$Latitude)
This is the basic shape of our data. It would look better with color indicating the number of cases and overlayed on an actual plot of the United States.
####Create a new vector for the ranges of confirmed cases data
casesRange<-list(c(seq(1,49,1)),c(seq(50,99,1)),c(seq(100,199,1)),c(seq(200,499,1)),
c(seq(500,999,1)),c(seq(1000,1999,1)),c(seq(2000,9999,1)),c(0))
us$ConfirmedRange<-NA
us$ConfirmedRange[us$Confirmed %in% casesRange[[8]]]<-"0"
us$ConfirmedRange[us$Confirmed %in% casesRange[[1]]]<-"1-49"
us$ConfirmedRange[us$Confirmed %in% casesRange[[2]]]<-"50-99"
us$ConfirmedRange[us$Confirmed %in% casesRange[[3]]]<-"100-199"
us$ConfirmedRange[us$Confirmed %in% casesRange[[4]]]<-"200-499"
us$ConfirmedRange[us$Confirmed %in% casesRange[[5]]]<-"500-999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[6]]]<-"1,000-1,999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[7]]]<-"2,000-9,999"
us$ConfirmedRange[us$Confirmed >= 10000]<-"10,000+"
us$ConfirmedRange<-as.factor(us$ConfirmedRange)
####Reorder the factor levels from least to greatest
us$ConfirmedRange<-factor(us$ConfirmedRange,levels = levels(us$ConfirmedRange)[c(1,2,8,5,7,9,3,6,4)])
####Plot the confirmed cases by their American Latitude/Longitude
colors<-c(rgb(127,255,0,maxColorValue = 255),rgb(34,139,34,maxColorValue = 255),rgb(0,100,0,maxColorValue = 255),
rgb(255,255,0,maxColorValue = 255),rgb(255,165,0,maxColorValue = 255),rgb(255,0,0,maxColorValue = 255),
rgb(139,0,0,maxColorValue = 255),rgb(128,0,128,maxColorValue = 255),rgb(0,0,0,maxColorValue = 255))
world <- ne_countries(scale = "medium", returnclass = "sf")
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
states <- cbind(states, st_coordinates(st_centroid(states)))
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = ConfirmedRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Confirmed Cases by State")+
theme_void()
Next, we can look at the number of deaths per report in each state.
####We can reuse the code from before changing it to count deaths instead in a new column
us$Deaths[is.na(us$Deaths)]<-0
us$DeathsRange<-NA
us$DeathsRange[us$Deaths %in% casesRange[[8]]]<-"0"
us$DeathsRange[us$Deaths %in% casesRange[[1]]]<-"1-49"
us$DeathsRange[us$Deaths %in% casesRange[[2]]]<-"50-99"
us$DeathsRange[us$Deaths %in% casesRange[[3]]]<-"100-199"
us$DeathsRange[us$Deaths %in% casesRange[[4]]]<-"200-499"
us$DeathsRange[us$Deaths %in% casesRange[[5]]]<-"500-999"
us$DeathsRange[us$Deaths %in% casesRange[[6]]]<-"1,000-1,999"
us$DeathsRange[us$Deaths %in% casesRange[[7]]]<-"2,000-9,999"
us$DeathsRange[us$Deaths >= 10000]<-"10,000+"
us$DeathsRange<-as.factor(us$DeathsRange)
us$DeathsRange<-factor(us$DeathsRange,levels = levels(us$DeathsRange)[c(1,2,8,5,7,9,3,6,4)])
####We can use largely the same plot code from before to plot the deaths
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = DeathsRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Deaths by State")+
theme_void()
When comparing the cases to deaths, most of the same areas light up, but at a lower rate than expected
Going beyond static images, We can also use plotly to generate an interactive plot that supports zooming and subsetting of data to show multiple points at once.
####First, we need to create unique IDs for cities that have reported multiple times.
us$Loc<-paste(us$Latitude,us$Longitude)
us$Loc<-as.factor(us$Loc)
####Then we take only the highest number for cumulative confirmed, deaths, and recovered.
us$Date<-as.Date(us$Date)
usg<- us %>%
arrange(desc(Date)) %>%
group_by(Loc) %>%
top_n(1,Date) ####Select only the latest date for each location
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray70"),
subunitcolor = toRGB("gray85"),
countrycolor = toRGB("gray85"),
countrywidth = 0.5,
subunitwidth = 0.5
)
fig <- plot_geo(usg, lat = ~Latitude, lon = ~Longitude, colors = colors)
fig <- fig %>% add_markers(
text = ~paste(Province.State,paste("Confirmed",Confirmed),paste("Deaths",Deaths),paste("Recovered",Recovered), sep = "<br />"),
color = ~ConfirmedRange,
symbol = I("square"),
size = I(8),
hoverinfo = "text",
)
fig <- fig %>% layout(
title = 'Coronavirus Cases in the United States<br />(Hover for totals)', geo = g
)
##This interactive plotly figure displays the total confirmed cases, deaths, and recovered in the United States.
##The plot can be dragged and zoomed and data can be hidden by severity group.
fig
We should take the rate of death into consideration. I compared the rate of confirmed cases to the rate deaths to get a mortality rate for the data. Based on the chart below, it appears that there may be some relationship between the number of cases and the rate of death.
####Create a new column
usg$Rate<-round(usg$Deaths/usg$Confirmed,3)*100
usg$Rate[is.nan(usg$Rate) | is.infinite(usg$Rate)]<-0.0
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray70"),
subunitcolor = toRGB("gray85"),
countrycolor = toRGB("gray85"),
countrywidth = 0.5,
subunitwidth = 0.5
)
fig <- plot_geo(usg, lat = ~Latitude, lon = ~Longitude)
fig <- fig %>% add_markers(
text = ~paste(Province.State,paste("Confirmed",Confirmed),paste("Deaths",Deaths),paste0("Mortality Rate ",Rate,"%"), sep = "<br />"),
color = ~Rate,
symbol = I("square"),
marker = list(
size = ~I(log(Confirmed,base = exp(.5)))
),
hoverinfo = "text",
colors = c("grey85","red","black")
)
fig <- fig %>% layout(
title = 'Coronavirus Mortality Rates in the United States<br />(Hover for totals)', geo = g
)
colorbar(fig,limits=c(0,13))
Marker sizes for each location in the above graph correspond to the number of confirmed cases to mitigate the visual effect of low incidence rate, high mortality rate points have on the graph. This also helps distinguish between points on the graph when zooming in. The darkest color represents a 13% mortality rate, which are between 2 and 3.5 times the projected mortality rate (1).
The graphs suggest that there is a relationship between the number of incidents and the number of deaths such that as incidents increase, so does the death percentage. Below is a scatterplot of the relationship between incidents and deaths. Note that regions that had greater than 99% percent mortality rate (7) or greater than 20,000 cases (6) for visibility purposes.
####Exclude Outliers
gset<-usg[usg$Rate<99 & usg$Confirmed < 20000,]
gg<-ggplot(gset,aes(x = Confirmed, y = Rate))
gg+geom_point()+theme_bw()+
labs(x = "Confirmed Cases", y = "Mortality Rate (%)",title = "Mortality Rate Compared to Confirmed Cases")+
scale_y_continuous(breaks = seq(0,60,5))+
scale_x_continuous(breaks = seq(0,20000,1000))+
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = .5))
The above graph displays that as cases increase, the actual mortality rate regresses toward the mean and variability in mortality rate decreases as we would expect.
The correlation between the two variables is low (r = 0.10). The residuals vs. fitted plot shows that the data violates the assumption of homogeneity of variance between the two variables. This means that even if the two variables had a strong correlation, simple linear regression would still be innapropriate.
cor(gset$Rate,gset$Confirmed)
## [1] 0.1014105
lm1<-lm(Rate~Confirmed,data=gset)
plot(lm1,c(3))
What this data does tell us is that the virus has
The next question is how the virus is being handled on a state-by-state basis. the following graph shows the mortality rate by state. The following graph is rather dense so maximizing this document may help view it.
####Remove non-state locations from the data and Alaska (0 Results which leads to errors)
gset2<- usg %>%
filter(Rate < 99, str_detect(Province.State,",|US|Grand|Alaska",negate = T))
####Create a variable to store the average death rate from every state. Note that this differs from simply taking the average mortality rate from each county.
a<- gset2 %>%
group_by(Province.State) %>%
summarise(avg = sum(Deaths)/sum(Confirmed)*100) %>%
as.data.frame(arrange(desc(avg)))
a<-a[rev(order(a$avg)),]
####Sum the confirmed cases for each state
gset2$Province.State <-as.factor(gset2$Province.State)
gset2$Province.State<-factor(gset2$Province.State,levels = a$Province.State)
conf<-gset2 %>%
group_by(Province.State) %>%
summarise(Total = sum(Confirmed)) %>%
as.data.frame(.)
gg<-ggplot(gset2,aes(x = Province.State, y = Rate))
gg+geom_count()+theme_bw()+
geom_hline(yintercept = 6,linetype = "dotted",color = "grey20")+
geom_point(data=a, aes(x = Province.State, y = avg),color="red",size = 2,shape = 15)+
geom_text(data=a, aes(x = Province.State, y = -3),color="blue",label = paste(conf$Total),size = 3.2,angle= 90)+
theme(axis.text.x = element_text(angle = 90))+
scale_y_continuous(breaks = seq(0,60,5))+
labs(x = "State",y = "Mortality Rate (%)", title = "United States Covid-19 Mortality Rate by State (Mortality Rate in Red, Confirmed Cases in Blue)",
size = "Counties")+
annotate("text",label = "bold('May 15th, 2020 \n US Confirmed Cases: 1,481,261 \n US Deaths: 88,651 \n US Mortality Rate: 6.0%')",parse=T,hjust=0,y = 50, x = 1)